DATA PREPARATION

LIBRARIES

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ---------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.7
## v tidyr   0.8.1     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

IMPORT DATA

# Load the DengAI training features and the matching training labels.
# NOTE: the paths are absolute and specific to the author's machine.
d_train <- read_csv("D:/Google Drive/RYERSON/CKME 136/DengAI/DATASET/dengue_features_train.csv")
d_labels <- read_csv("D:/Google Drive/RYERSON/CKME 136/DengAI/DATASET/dengue_labels_train.csv")
# Load the test features from the *test* file. Reading the training file here
# made every training row appear twice once train and test are row-bound
# (1872 SJ rows = 2 x 936, 1040 IQ rows = 2 x 520 in the summaries below).
d_test <- read_csv("D:/Google Drive/RYERSON/CKME 136/DengAI/DATASET/dengue_features_test.csv")

Rescale the variables so that all temperatures are in Celsius and all precipitation is in mm (only the Kelvin temperature columns need converting below).

# Convert the reanalysis temperature columns from Kelvin to Celsius by
# subtracting 273.15, applying the same shift to the training and test features.
kelvin_cols <- c(
  "reanalysis_dew_point_temp_k",
  "reanalysis_air_temp_k",
  "reanalysis_max_air_temp_k",
  "reanalysis_min_air_temp_k",
  "reanalysis_avg_temp_k"
)
for (kelvin_col in kelvin_cols) {
  d_train[[kelvin_col]] <- d_train[[kelvin_col]] - 273.15
  d_test[[kelvin_col]] <- d_test[[kelvin_col]] - 273.15
}
# Drop the loop helpers so the workspace holds only the data frames.
rm(kelvin_cols, kelvin_col)

# reanalysis_tdtr_k is intentionally left unconverted: its values do not look
# like absolute Kelvin temperatures (see the summary printed below).
# d_train$reanalysis_tdtr_k <- d_train$reanalysis_tdtr_k - 273.15
# d_test$reanalysis_tdtr_k <- d_test$reanalysis_tdtr_k - 273.15

# Print summary() for each converted temperature column of the training
# features, followed by the unconverted reanalysis_tdtr_k for comparison.
invisible(lapply(
  c(
    "reanalysis_dew_point_temp_k",
    "reanalysis_air_temp_k",
    "reanalysis_max_air_temp_k",
    "reanalysis_min_air_temp_k",
    "reanalysis_avg_temp_k",
    "reanalysis_tdtr_k"
  ),
  function(temp_col) print(summary(d_train[[temp_col]]))
))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   16.49   20.97   22.49   22.10   23.31   25.30      10
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.49   24.51   25.50   25.55   26.68   29.05      10
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   24.65   27.85   29.25   30.28   32.35   40.85      10
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.75   20.75   23.05   22.57   24.75   26.75      10
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.74   25.11   26.14   26.08   27.06   29.78      10
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.357   2.329   2.857   4.904   7.625  16.029      10

MERGE TRAIN & LABELS

# Attach the labels to the training features, matching rows on city, year and
# week of year.
df <- merge(
  x = d_train,
  y = d_labels,
  by = c("city", "year", "weekofyear")
)

FILTER BY CITY (SJ, IQ)

# Split the merged training data into one data frame per city code.
iq <- df[df$city == "iq", ]
sj <- df[df$city == "sj", ]

MERGE TRAIN & TEST

Merge test and train set without the total_cases

# Stack the training features on top of the test features (no labels, so no
# total_cases column).
df_all <- rbind(d_train, d_test)

FILTER BY CITY (SJ_ALL, IQ_ALL)

# Split the combined train + test features into one frame per city code.
sj_all <- df_all[df_all$city == "sj", ]
iq_all <- df_all[df_all$city == "iq", ]

PRELIMINARY DATA PREPARATION

This section includes importing the data, creating of new variables and establishing the dataframes for the initial analysis

Summary Stats

library(skimr)
## Warning: package 'skimr' was built under R version 3.4.4
# Wide skimr summary of the SJ training rows, leaving out columns 1 and 4.
skimmed.sj <- skim_to_wide(sj[, -c(1, 4)])
## Warning: package 'bindrcpp' was built under R version 3.4.4
print(skimmed.sj)
## # A tibble: 23 x 13
##    type   variable   missing complete n     mean   sd    p0    p25   p50  
##    <chr>  <chr>      <chr>   <chr>    <chr> <chr>  <chr> <chr> <chr> <chr>
##  1 integ~ total_cas~ 0       936      936   "  34~ 51.38 0     "   ~ "  1~
##  2 integ~ weekofyear 0       936      936   "  26~ 15.02 1     "  1~ "  2~
##  3 integ~ year       0       936      936   1998.~ " 5.~ 1990  "199~ "199~
##  4 numer~ ndvi_ne    191     745      936   " 0.0~ " 0.~ "-0.~ " 0.~ " 0.~
##  5 numer~ ndvi_nw    49      887      936   " 0.0~ " 0.~ "-0.~ " 0.~ " 0.~
##  6 numer~ ndvi_se    19      917      936   " 0.1~ " 0.~ -0.0~ " 0.~ " 0.~
##  7 numer~ ndvi_sw    19      917      936   " 0.1~ " 0.~ -0.0~ " 0.~ " 0.~
##  8 numer~ precipita~ 9       927      936   "35.4~ "44.~ " 0 ~ " 0 ~ "20.~
##  9 numer~ reanalysi~ 6       930      936   "26.0~ " 1.~ "22.~ "25.~ "26.~
## 10 numer~ reanalysi~ 6       930      936   "26.1~ " 1.~ "22.~ "25.~ "26.~
## # ... with 13 more rows, and 3 more variables: p75 <chr>, p100 <chr>,
## #   hist <chr>
# Wide skimr summary of the IQ training rows, leaving out columns 1 and 4.
skimmed.iq <- skim_to_wide(iq[, -c(1, 4)])
print(skimmed.iq)
## # A tibble: 23 x 13
##    type   variable   missing complete n     mean   sd    p0    p25   p50  
##    <chr>  <chr>      <chr>   <chr>    <chr> <chr>  <chr> <chr> <chr> <chr>
##  1 integ~ total_cas~ 0       520      520   "   7~ 10.77 0     "   ~ "   ~
##  2 integ~ weekofyear 0       520      520   "  26~ 15.03 1     "  1~ "  2~
##  3 integ~ year       0       520      520   "2005~ " 2.~ 2000  2002~ "200~
##  4 numer~ ndvi_ne    3       517      520   " 0.2~ " 0.~ " 0.~ " 0.~ " 0.~
##  5 numer~ ndvi_nw    3       517      520   " 0.2~ " 0.~ " 0.~ " 0.~ " 0.~
##  6 numer~ ndvi_se    3       517      520   " 0.2~ " 0.~ " 0.~ " 0.~ " 0.~
##  7 numer~ ndvi_sw    3       517      520   " 0.2~ " 0.~ " 0.~ " 0.~ " 0.~
##  8 numer~ precipita~ 4       516      520   64.25  "35.~ " 0 ~ 39.11 60.47
##  9 numer~ reanalysi~ 4       516      520   24.72  " 1.~ "21.~ 23.94 24.67
## 10 numer~ reanalysi~ 4       516      520   25.98  " 1.~ "21.~ 25.07 25.97
## # ... with 13 more rows, and 3 more variables: p75 <chr>, p100 <chr>,
## #   hist <chr>
# Wide skimr summary of the SJ train + test features, leaving out columns 1 and 4.
skimmed.sj_all <- skim_to_wide(sj_all[, -c(1, 4)])
print(skimmed.sj_all)
## # A tibble: 22 x 13
##    type   variable   missing complete n     mean   sd    p0    p25   p50  
##    <chr>  <chr>      <chr>   <chr>    <chr> <chr>  <chr> <chr> <chr> <chr>
##  1 integ~ weekofyear 0       1872     1872  "  26~ 15.02 1     "  1~ "  2~
##  2 integ~ year       0       1872     1872  1998.~ " 5.~ 1990  "199~ "199~
##  3 numer~ ndvi_ne    382     1490     1872  " 0.0~ " 0.~ "-0.~ " 0.~ " 0.~
##  4 numer~ ndvi_nw    98      1774     1872  " 0.0~ " 0.~ "-0.~ " 0.~ " 0.~
##  5 numer~ ndvi_se    38      1834     1872  " 0.1~ " 0.~ -0.0~ " 0.~ " 0.~
##  6 numer~ ndvi_sw    38      1834     1872  " 0.1~ " 0.~ -0.0~ " 0.~ " 0.~
##  7 numer~ precipita~ 18      1854     1872  "35.4~ "44.~ " 0 ~ " 0 ~ "20.~
##  8 numer~ reanalysi~ 12      1860     1872  "26.0~ " 1.~ "22.~ "25.~ "26.~
##  9 numer~ reanalysi~ 12      1860     1872  "26.1~ " 1.~ "22.~ "25.~ "26.~
## 10 numer~ reanalysi~ 12      1860     1872  "21.9~ " 1.~ "16.~ "20.~ "22.~
## # ... with 12 more rows, and 3 more variables: p75 <chr>, p100 <chr>,
## #   hist <chr>
# Wide skimr summary of the IQ train + test features, leaving out columns 1 and 4.
skimmed.iq_all <- skim_to_wide(iq_all[, -c(1, 4)])
print(skimmed.iq_all)
## # A tibble: 22 x 13
##    type   variable    missing complete n     mean  sd    p0    p25   p50  
##    <chr>  <chr>       <chr>   <chr>    <chr> <chr> <chr> <chr> <chr> <chr>
##  1 integ~ weekofyear  0       1040     1040  "  2~ 15.02 1     "  1~ "  2~
##  2 integ~ year        0       1040     1040  "200~ " 2.~ 2000  2002~ "200~
##  3 numer~ ndvi_ne     6       1034     1040  " 0.~ " 0.~ " 0.~ " 0.~ " 0.~
##  4 numer~ ndvi_nw     6       1034     1040  " 0.~ " 0.~ " 0.~ " 0.~ " 0.~
##  5 numer~ ndvi_se     6       1034     1040  " 0.~ " 0.~ " 0.~ " 0.~ " 0.~
##  6 numer~ ndvi_sw     6       1034     1040  " 0.~ " 0.~ " 0.~ " 0.~ " 0.~
##  7 numer~ precipitat~ 8       1032     1040  64.25 "35.~ " 0 ~ 39.11 60.47
##  8 numer~ reanalysis~ 8       1032     1040  24.72 " 1.~ "21.~ 23.94 24.67
##  9 numer~ reanalysis~ 8       1032     1040  25.98 " 1.~ "21.~ 25.07 25.97
## 10 numer~ reanalysis~ 8       1032     1040  22.34 " 1.~ "16.~ 21.44 "22.~
## # ... with 12 more rows, and 3 more variables: p75 <chr>, p100 <chr>,
## #   hist <chr>
# The skim tables were only needed for printing; remove them from the workspace.
rm(list = c("skimmed.sj", "skimmed.iq", "skimmed.iq_all", "skimmed.sj_all"))

DATAFRAME CLEANUP 1

Clean up all the extra dataframes produced during the exploratory analysis

#rm(d_test,
   # d_train,
   # dengue_labels_train,
   # sj_test,
   # sj_features_train,
   # sj_labels_train,
   # iq_test,
   # iq_features_train,
   # iq_labels_train,
   # df,
   # iq,
   # sj,
   # df,
   # submission_format
   #  )

INITIAL & EXPLORATORY ANALYSIS

In this section, we summarise the values in the data frames (together and by city). We also create the following graphs and tests:

  1. Frequency histograms
  2. Bivariate analysis - line graphs for time analysis
  3. Bivariate analysis - scatterplot for total_cases by other variables
  4. Wilcoxon rank-sum test for a difference in location between cities

Compare the same variables between the two cities

The Wilcoxon rank-sum tests below show that every feature differs significantly between the two cities (all p-values are below 0.05).

# Run a two-sample Wilcoxon rank-sum test for every column from the fifth
# onwards, comparing the SJ rows with the IQ rows of the merged training data.
# The column name is printed first because the test output itself only shows
# the call ("sj[, i] and iq[, i]").
cnames <- colnames(sj)
for (i in 5:(ncol(sj))){
  # sj and iq are row subsets of the same df, so index i is the same variable
  # in both frames.
  wilt <- wilcox.test(sj[,i],iq[,i])
  print(cnames[i])
  print(wilt)
}
## [1] "ndvi_ne"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 21691, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "ndvi_nw"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 32596, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "ndvi_se"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 107990, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "ndvi_sw"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 78560, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "precipitation_amt_mm"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 118470, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_air_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 369950, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_avg_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 255790, p-value = 0.03716
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_dew_point_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 208230, p-value = 3.071e-05
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_max_air_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 4645.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_min_air_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 474700, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_precip_amt_kg_per_m2"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 139740, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_relative_humidity_percent"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 62770, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_sat_precip_amt_mm"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 118470, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_specific_humidity_g_per_kg"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 192510, p-value = 4.502e-10
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_tdtr_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 22, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_avg_temp_c"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 183690, p-value = 1.887e-08
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_diur_temp_rng_c"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 6834, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_max_temp_c"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 59998, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_min_temp_c"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 361870, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_precip_mm"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 142000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "total_cases"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj[, i] and iq[, i]
## W = 401310, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Remove the loop helpers left behind by the Wilcoxon comparison.
rm(list = c("cnames", "i", "wilt"))

GRAPH: Frequency histogram of all variables in training set for SJ

Same as above but only for SJ.

# Frequency histogram (20 breaks) of every SJ training column from the fifth
# onwards, drawn two per page.
cnames <- colnames(sj)
par(mfrow = c(1, 2))
for (i in 5:ncol(sj)) {
  hist(sj[, i],
       breaks = 20,
       xlab = cnames[i],
       # The original paste(cnames[i], sep = ": ") had a single argument, so the
       # separator was never used; give the title the same prefix as the IQ plots.
       main = paste("Freq Histogram for SJ", cnames[i], sep = ": "))
}

rm(cnames, i)

GRAPH: Frequency histogram of all variables in training set for IQ

Same as above but only for IQ.

# Frequency histogram (20 breaks) of every column from the fifth onwards,
# restricted to the IQ rows of the merged training data, drawn two per page.
iq_rows <- df$city == "iq"
par(mfrow = c(1, 2))
for (var_name in colnames(df)[5:(ncol(df))]) {
  hist(
    df[iq_rows, var_name],
    breaks = 20,
    xlab = var_name,
    main = paste("Freq Histogram for IQ", var_name, sep = ": ")
  )
}

rm(iq_rows, var_name)

GRAPH: Climate variables by time for SJ

Includes all the data from the test and training sets over time for SJ; therefore total_cases is not included. Total_cases by time is done separately.

# Line plot of every SJ climate variable (column 5 onwards of the combined
# train + test features) against week_start_date, one plot per variable.
# No par(mfrow = ...) here: it only affects base graphics, not ggplot output.
cnames <- colnames(sj_all)
for (i in 5:(ncol(sj_all))) {
  gg1 <- ggplot(sj_all,
                # [[i]] extracts the column as a vector; sj_all[, i] on a tibble
                # is a one-column tibble, which ggplot cannot pick a scale for.
                aes(x = week_start_date,
                    y = sj_all[[i]])) +
    geom_line() +
    ylab(cnames[i]) +
    ggtitle(paste(cnames[i]))

  # Print inside the loop so the aesthetic is evaluated with the current i.
  print(gg1)
}
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

# Remove the helpers left behind by the SJ line plots.
rm(list = c("cnames", "i", "gg1"))

GRAPH: Climate variables by time for IQ

Includes all the data from the test and training sets over time for IQ; therefore total_cases is not included. Total_cases by time is done separately.

# Line plot of every IQ climate variable (column 5 onwards of the combined
# train + test features) against week_start_date, one plot per variable.
# No par(mfrow = ...) here: it only affects base graphics, not ggplot output.
cnames <- colnames(iq_all)
for (i in 5:(ncol(iq_all))) {
  gg1 <- ggplot(iq_all,
                # [[i]] extracts the column as a vector; iq_all[, i] on a tibble
                # is a one-column tibble, which ggplot cannot pick a scale for.
                aes(x = week_start_date,
                    y = iq_all[[i]])) +
    geom_line() +
    ylab(cnames[i]) +
    ggtitle(paste(cnames[i]))

  # Print inside the loop so the aesthetic is evaluated with the current i.
  print(gg1)
}
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.

# Remove the helpers left behind by the IQ line plots.
rm(list = c("cnames", "i", "gg1"))

GRAPH: Climate variables by week for SJ

Boxplots of each climate variable by week of year for SJ. Includes all the data from the test and training sets; therefore total_cases is not included. Total_cases by week is done separately.

library(ggplot2)

# Boxplot of every SJ climate variable (column 5 onwards of the combined
# train + test features) grouped by week of year, one plot per variable.
# No par(mfrow = ...) here: it only affects base graphics, not ggplot output.
cnames <- colnames(sj_all)
for (i in 5:(ncol(sj_all))) {
  # The plot data must be sj_all, the frame the y values come from. The
  # original passed sj (training rows only, re-sorted by merge), so weekofyear
  # was recycled against a longer, differently ordered y vector.
  gg1 <- ggplot(sj_all,
                aes(x = weekofyear,
                    # [[i]] yields a vector; sj_all[, i] is a one-column tibble.
                    y = sj_all[[i]],
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 2)) +
    ylab(cnames[i]) +
    ggtitle(paste(cnames[i]))

  # Print inside the loop so the aesthetic is evaluated with the current i.
  print(gg1)
}
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 382 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 98 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 38 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 38 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 18 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 18 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

# Remove the helpers left behind by the SJ weekly boxplots.
rm(list = c("cnames", "i", "gg1"))

GRAPH: Climate variables by week for IQ

Boxplots of each climate variable by week of year for IQ. Includes all the data from the test and training sets; therefore total_cases is not included. Total_cases by week is done separately.

library(ggplot2)

# Boxplot of every IQ climate variable (column 5 onwards of the combined
# train + test features) grouped by week of year, one plot per variable.
# No par(mfrow = ...) here: it only affects base graphics, not ggplot output.
cnames <- colnames(iq_all)
for (i in 5:(ncol(iq_all))) {
  gg1 <- ggplot(iq_all,
                aes(x = weekofyear,
                    # [[i]] yields a vector; iq_all[, i] is a one-column tibble,
                    # which ggplot cannot pick a scale for.
                    y = iq_all[[i]],
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 2)) +
    ylab(cnames[i]) +
    ggtitle(paste(cnames[i]))

  # Print inside the loop so the aesthetic is evaluated with the current i.
  print(gg1)
}
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 74 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 74 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).

## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 32 rows containing non-finite values (stat_boxplot).

# Remove the helpers left behind by the IQ weekly boxplots.
rm(list = c("cnames", "i", "gg1"))

GRAPH: Total_cases by time for SJ, IQ

Bar chart of total cases over time for each city. This uses only the training set.

library(ggplot2)

# Weekly dengue case counts over time, one bar chart per city, from the merged
# training data. The subtitle shows the first and last week_start_date of the
# city's rows. No par(mfcol = ...) here: it has no effect on ggplot output.

# Dengue cases for San Juan (city code "sj"); the title previously said
# "San Jose".
ggplot(data = df[df$city == "sj", ],
       aes(x = week_start_date, y = total_cases)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Total Dengue Cases in San Juan",
       subtitle = paste(min(df$week_start_date[df$city == "sj"]),
                        max(df$week_start_date[df$city == "sj"]),
                        sep = " to "),
       x = "Date", y = "Total dengue cases")

# Dengue cases for Iquitos (city code "iq")
ggplot(data = df[df$city == "iq", ],
       aes(x = week_start_date, y = total_cases)) +
  geom_bar(stat = "identity", fill = "green") +
  labs(title = "Total Dengue Cases in Iquitos",
       subtitle = paste(min(df$week_start_date[df$city == "iq"]),
                        max(df$week_start_date[df$city == "iq"]),
                        sep = " to "),
       x = "Date", y = "Total dengue cases")

GRAPH: Average Total_cases by week for SJ, IQ

Boxplots (with the weekly mean marked in red) and bar graphs of the average total cases by week of year. This uses only the training set.

library(ggplot2)

# Seasonality of total_cases by week of year for each city (training data).
# Two views per city: a boxplot per week with the weekly mean overlaid as a red
# point, and a bar graph of the weekly mean.
# NOTE(review): fun.y is the argument name in the ggplot2 2.2.1 attached by
# this session; newer releases reportedly rename it to fun -- confirm on upgrade.

# Boxplot of total_cases per week with the mean drawn as a red dot.
weekly_boxplot <- function(city_df, city_label) {
  ggplot(city_df,
         aes(x = weekofyear, y = total_cases, group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 1)) +
    stat_summary(fun.y = mean, geom = "point", shape = 20, size = 3,
                 color = "red", fill = "red") +
    ylab("Total cases") +
    ggtitle(paste("Boxplot: Total cases by Week for", city_label))
}

# Bar graph of the mean total_cases per week.
weekly_mean_bars <- function(city_df, city_label) {
  ggplot(data = city_df, aes(x = weekofyear, y = total_cases)) +
    geom_bar(stat = "summary", fun.y = "mean") +
    ggtitle(paste("Bar graph: Average total cases by Week for", city_label)) +
    scale_x_continuous(breaks = seq(1, 52, 2))
}

# Same drawing order as before: SJ boxplot, SJ bars, IQ boxplot, IQ bars.
print(weekly_boxplot(sj, "SJ"))

print(weekly_mean_bars(sj, "SJ"))

print(weekly_boxplot(iq, "IQ"))

print(weekly_mean_bars(iq, "IQ"))

# The helpers are only needed here; leave the workspace as it was.
rm(weekly_boxplot, weekly_mean_bars)

GRAPH: Total_cases by climate variables for SJ

Same as above but for SJ

# Scatterplot of each climate variable (y) against total_cases (x) for the SJ
# rows of the merged training data, four plots per page.
cnames <- colnames(df)
sj_rows <- df$city == "sj"  # computed once instead of twice per plot
par(mfrow = c(2, 2))
# total_cases is the last column, so stop one short of ncol(df).
for (i in 5:(ncol(df) - 1)) {
  plot(df$total_cases[sj_rows],
       df[sj_rows, i],
       cex = 0.5,
       pch = 19,
       # y limits come from both cities so the SJ and IQ plots share a scale.
       ylim = range(df[, i], na.rm = TRUE),
       main = paste("Total_cases for SJ by climate variables", cnames[i], sep = ": "),
       # Explicit label: the default is the deparsed subsetting expression.
       xlab = "total_cases",
       ylab = cnames[i])
}

rm(cnames, i, sj_rows)

GRAPH: Total_cases by climate variables for IQ

Same as above but for IQ.

# Scatterplot of each climate variable (y) against total_cases (x) for the IQ
# rows of the merged training data, four plots per page.
cnames <- colnames(df)
iq_rows <- df$city == "iq"  # computed once instead of twice per plot
par(mfrow = c(2, 2))
# total_cases is the last column, so stop one short of ncol(df).
for (i in 5:(ncol(df) - 1)) {
  plot(df$total_cases[iq_rows],
       df[iq_rows, i],
       cex = 0.5,
       pch = 19,
       # y limits come from both cities so the SJ and IQ plots share a scale.
       ylim = range(df[, i], na.rm = TRUE),
       main = paste("Total_cases for IQ by climate variables", cnames[i], sep = ": "),
       # Explicit label: the default is the deparsed subsetting expression.
       xlab = "total_cases",
       ylab = cnames[i])
}

rm(cnames, i, iq_rows)

Compare similar variable values within the dataset

There are several variables which appear to be the same feature but taken from a different source. For example, station_precip_mm, precipitation_amt_mm and reanalysis_sat_precip_amt_mm all appear to be the same “Total Precipitation” value. Only one should be kept if they are the same.

Difference in max air temp

“station_max_temp_c” and “reanalysis_max_air_temp_k” (scaled to Celsius)

library(ggplot2)

# Station reading minus reanalysis reading for the maximum temperature
sj[["max_air_diff"]] <- with(sj, station_max_temp_c - reanalysis_max_air_temp_k)

# Bars of the difference, stacked within each year
ggplot(sj, aes(year, max_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).

# Spread of the difference within each year
ggplot(sj, aes(year, max_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Calendar month (1-12) taken from the week start date
sj[["month"]] <- 1 + as.POSIXlt(sj$week_start_date)$mon

# Spread of the difference within each calendar month
ggplot(sj, aes(month, max_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Remove the two helper columns again
sj[c("max_air_diff", "month")] <- NULL

Difference in min air temp

“station_min_temp_c” and “reanalysis_min_air_temp_k” (scaled to Celsius)

library(ggplot2)

# Station reading minus reanalysis reading for the minimum temperature
sj[["min_air_diff"]] <- with(sj, station_min_temp_c - reanalysis_min_air_temp_k)

# Bars of the difference, stacked within each year
ggplot(sj, aes(year, min_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).

# Spread of the difference within each year
ggplot(sj, aes(year, min_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Calendar month (1-12) taken from the week start date
sj[["month"]] <- 1 + as.POSIXlt(sj$week_start_date)$mon

# Spread of the difference within each calendar month
ggplot(sj, aes(month, min_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Remove the two helper columns again
sj[c("min_air_diff", "month")] <- NULL

Difference in average air temp

“station_avg_temp_c” and “reanalysis_avg_temp_k” (scaled to Celsius)

library(ggplot2)

# Station reading minus reanalysis reading for the average temperature
sj[["avg_air_diff"]] <- with(sj, station_avg_temp_c - reanalysis_avg_temp_k)

# Bars of the difference, stacked within each year
ggplot(sj, aes(year, avg_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).

# Spread of the difference within each year
ggplot(sj, aes(year, avg_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Calendar month (1-12) taken from the week start date
sj[["month"]] <- 1 + as.POSIXlt(sj$week_start_date)$mon

# Spread of the difference within each calendar month
ggplot(sj, aes(month, avg_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Remove the two helper columns again
sj[c("avg_air_diff", "month")] <- NULL

Difference in total precipitation

“station_precip_mm”, “precipitation_amt_mm”, “reanalysis_sat_precip_amt_mm”, “reanalysis_precip_amt_kg_per_m2”

library(ggplot2)

# Precipitation columns to compare against each other.
precip <- c("station_precip_mm", "precipitation_amt_mm", "reanalysis_sat_precip_amt_mm", "reanalysis_precip_amt_kg_per_m2")

# Fail early with a clear message if any of the columns is absent from sj.
stopifnot(all(precip %in% colnames(sj)))

# Add month (1-12) to the dataframe
sj$month <- as.POSIXlt(sj$week_start_date)$mon + 1

# Every unordered pair of columns, in the order (1,2), (1,3), ..., (3,4).
# Deriving the pairs from `precip` keeps the loop correct if the vector changes.
# No par(mfrow = ...) call here: it only affects base graphics, not ggplot2.
precip_pairs <- combn(precip, 2, simplify = FALSE)

for (precip_pair in precip_pairs) {
  p1 <- precip_pair[1]
  p2 <- precip_pair[2]

  # Difference between the two sources; `[[` looks the columns up by name.
  sj$diff <- sj[[p1]] - sj[[p2]]

  # barplot the difference by year
  gg1 <- ggplot(sj, aes(x = year, y = diff)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    ggtitle(paste(p1, "&", p2))
  print(gg1)

  # box plot the difference by year
  gg2 <- ggplot(sj, aes(x = year, y = diff, group = year)) +
    geom_boxplot() +
    ggtitle(paste(p1, "&", p2))
  print(gg2)

  # box plot the difference by month
  gg3 <- ggplot(sj, aes(x = month, y = diff, group = month)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 12, 1)) +
    ggtitle(paste(p1, "&", p2))
  print(gg3)
}
## Warning: Removed 9 rows containing missing values (position_stack).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing missing values (position_stack).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 6 rows containing missing values (position_stack).

## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing missing values (position_stack).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing missing values (position_stack).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing missing values (position_stack).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

# Drop the helper columns and the temporaries created by this comparison.
sj$diff <- NULL
sj$month <- NULL

rm(gg1, gg2, gg3, p1, p2, precip, precip_pair, precip_pairs)

ANALYSIS OF OUTLIERS

GRAPH: Boxplot of climate variables (test and train)

The boxplots include both the test and training sets; NA values are still included.

library(ggplot2)
# One boxplot per column of df from the 5th onward, split and filled by city.
# The column name is used as both the subtitle and the y-axis label.
cnames <- names(df)
for (i in seq(5, ncol(df))) {
  p <- ggplot(df, aes(x = city, y = df[, i], fill = city))
  p <- p + geom_boxplot()
  p <- p + labs(
    title = "Boxplot of climate variables",
    subtitle = cnames[i],
    x = "City",
    y = cnames[i]
  )
  print(p)
}
## Warning: Removed 194 rows containing non-finite values (stat_boxplot).

## Warning: Removed 52 rows containing non-finite values (stat_boxplot).

## Warning: Removed 22 rows containing non-finite values (stat_boxplot).

## Warning: Removed 22 rows containing non-finite values (stat_boxplot).

## Warning: Removed 13 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 13 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Warning: Removed 43 rows containing non-finite values (stat_boxplot).

## Warning: Removed 43 rows containing non-finite values (stat_boxplot).

## Warning: Removed 20 rows containing non-finite values (stat_boxplot).

## Warning: Removed 14 rows containing non-finite values (stat_boxplot).

## Warning: Removed 22 rows containing non-finite values (stat_boxplot).

# Remove the column-name vector, loop index and plot object from the workspace.
rm(cnames, i, p)

GRAPH: Boxplot of total cases

library(ggplot2)
# Distribution of total_cases per city, one filled box for each city.
ggplot(data = df, mapping = aes(city, total_cases, fill = city)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Total_cases",
    x = "City",
    y = "Total_cases"
  )